** Set the working directory; every dataset below is referenced relative to it.
cd "C:\Users\vfan\Dropbox\Lancet\PubMed+ScienceDirect\PubMed"

** HOW THIS PARTICULAR DATASET WAS CONSTRUCTED:
**   1. THE 3095 LETTERS/COMMENTS WERE DOWNLOADED FROM PUBMED AS A CSV ("CSV DIRECT").
**   2. THAT CSV WAS MERGED WITH A BIBTEX EXPORT OF THE SAME RECORDS, IN WHICH THE PUBLICATION DATE
**      HAD BEEN PARSED INTO SEPARATE VARIABLES.
**   3. THE RESULT WAS MERGED WITH THE PUBMED DOWNLOAD OF ALL 7529 ARTICLES TO CREATE THE FULL DATASET,
**      WHICH DEMARCATES THE RECORDS THAT ARE CORRESPONDENCE LETTERS/COMMENTS.
use "pubmed_result_letters_comments_2008-12_abstract_CSVdirect-merged-BibTex_MERGED-ALL-CSVdirect.dta", clear

** Expected: 7529 articles, including both research articles and comments/letters.
count

** NEXT WE MERGE THE FULL 7529-ARTICLE DATASET WITH THE DATASET THAT ONLY HAS INFORMATION ON KEYWORDS --> NECESSARY FOR CLASSIFYING 'GHSS'
** Old-style merge syntax: the data in memory is sorted on the key here; the using dataset is
** assumed to be already sorted by entrezuid -- TODO confirm.
sort entrezuid
merge entrezuid using pubmed_result_ALL_2008-12_abstract_keywords
tab _merge /*perfect merge */
** Enforce the "perfect merge" noted above instead of only eyeballing the table: an unmatched record
** would have no keywords and would silently be classified as non-GHSS further down.
assert _merge == 3
drop _merge

** Inspect custom4/custom5 one last time, then discard them.
describe custom4 custom5
sum custom4 custom5
codebook custom4 custom5
drop custom4 custom5

** NOTE THAT KEY REFERS TO THE GHSS ARTICLE TYPE USED IN THE PAPER
** The 'custom3' variable has information on the article's keywords.
** key = 1 when custom3 contains any of the terms below (case-sensitive substring match), else 0.
** strpos() matches its argument literally and has no wildcards, so the former search term
** "America*" only matched keywords containing an actual asterisk. It is now "America", which also
** covers "Americas", "American", "Latin America", etc.
** NOTE(review): this can change how many articles are flagged -- re-check the tabulations below.
g key = 0
foreach term in "Global" "World" "Developing Countries" "International" "Nations" "Middle East" "Africa" "Asia" "America" "Social" "Economic" "Political" {
    replace key = 1 if strpos(custom3, "`term'") > 0
}

** CORRESPONDENCELETTER INDICATES WHETHER THE ARTICLE IS A LETTER/COMMENT (1) OR A RESEARCH ARTICLE (0)
tabulate correspondenceletter

** Records with no flag are treated as research articles.
replace correspondenceletter = 0 if correspondenceletter == .

** Cross-tabulate GHSS status against article type, first with row then with column percentages.
foreach pct in row col {
    tabulate key correspondenceletter, `pct' chi2
}

** GHSS distribution among research articles only.
tabulate key if correspondenceletter == 0

** ADDITIONAL CHECK OF WHETHER GHSS/KEY ARTICLES HAVE MORE OR FEWER KEYWORDS; THEY HAVE MORE KEYWORDS THAN NON-GHSS ARTICLES
** Summarize the keyword count within each key group, then test the difference in means.
bysort key: sum numberkeywords
ttest numberkeywords, by(key)

** 'replace' lets the do-file be re-run; without it save aborts once the file exists.
save "pubmed_result_letters_comments_2008-12_abstract_CSVdirect-merged-BibTex_MERGED-ALL-CSVdirect-KEYWORDS.dta", replace





** THE PREVIOUS DATASET DOES NOT LINK LETTERS/CORRESPONDENCE TO THEIR RESPECTIVE RESEARCH ARTICLES.
** THEREFORE, TO CREATE THAT LINK, WE USE THE FOLLOWING PROCEDURE.
** SUMMARY OF THE MERGE STEPS, IN ORDER:
* first, the letters/comments CSV-direct file was merged with the BibTex database
* then the all-articles CSV-direct file was merged in
* then the keywords for all articles were merged in
* finally, the TXT file, which has information on the commented-on articles, was merged in

** Reload the keyword-augmented dataset and attach the 'commenton' information from the TXT export.
use "pubmed_result_letters_comments_2008-12_abstract_CSVdirect-merged-BibTex_MERGED-ALL-CSVdirect-KEYWORDS.dta",clear
sort entrezuid
merge entrezuid using pubmed_result_TXT /* THIS DATASET OF COMMENTS/LETTERS WAS DOWNLOADED FROM PUBMED AS A TXT BECAUSE IT HAD A VARIABLE 'COMMENT ON' */
tab _merge
** Keep the merge indicator under its own name.
** NOTE(review): later steps refer to this variable as lowercase 'mergewithtxt'; Stata names are
** case-sensitive, so the name was presumably lowercased during the Excel round-trip -- confirm.
ren _merge mergewithTXT

** SHOWS THAT THE COMMENTON VARIABLE IS ONLY FILLED IN FOR CORRESPONDENCE LETTERS
tab commenton if correspondenceletter  == 0
tab commenton if correspondenceletter  == 1

** 'replace' lets the do-file be re-run; without it save aborts once the file exists.
save "pubmed_result_final_2014-03-24.dta", replace

** HERE DETAILSTOMERGE2 WAS CODED IN EXCEL (pubmed_result_final_2014-03-24.xls) -> "new" tab
** The hand-coded data must be in memory before this save. The guard below stops the do-file
** if detailstomerge2 is absent, e.g. when the script is run top to bottom without the manual
** Excel step. Without it the hand-coded v3 file would be overwritten by data lacking that variable.
confirm variable detailstomerge2
save "pubmed_result_final_2014-03-24_v3.dta",replace

** Load the hand-coded v3 data and build detailstomerge3, the reference string later used as merge key.
use "pubmed_result_final_2014-03-24_v3.dta",clear
codebook details detailstomerge2 commenton
codebook commenton /* 1705 unique citations commenting on something else */
codebook commenton if correspondenceletter == 1
codebook commenton if correspondenceletter == 0

** Start from the hand-coded reference, then use 'commenton' for records matched to the TXT file.
g detailstomerge3 = detailstomerge2
tab mergewithtxt mergelettersall /*THERE ARE 313 OF 3317 LETTERS WHICH DID NOT NOTE WHAT ARTICLE THEY COMMENT ON */
replace detailstomerge3 = commenton if mergewithtxt == 3
codebook details detailstomerge2 detailstomerge3 commenton

** Indicator: 1 when the record names the article it comments on, 0 otherwise.
g hascommenton = (commenton != "")
tab hascommenton

** 'replace' lets the do-file be re-run; without it save aborts once the file exists.
save "pubmed_result_final_2014-03-24_v4.dta", replace /* compared to "pubmed_result_final_2014-03-24.dta" this has added detailstomerge toremove detailstomerge2 and detailstomerge3 */

** Build a lookup with one row per reference (detailstomerge3) holding the number of
** letters/comments attached to it.
tab mergewithtxt
tab mergewithtxt correspondenceletter

** count is 1 for letters and 0 otherwise, so its sum per group is the number of letters.
g count = correspondenceletter
collapse (sum) count key, by(detailstomerge3 mergewithtxt)
tab mergewithtxt
count
label var count "count of correspondenceletters associated with this reference detailstomerge3"
*tab key correspondenceletter,row chi

** After the collapse, key and count are group sums; turn them into any-GHSS / any-letter flags.
g key2 = key >0
g cl2 = count >0
tab key2 cl2, row chi /*the number of research articles for which */

** Keep only the groups that come from records matched to the TXT file.
keep if mergewithtxt == 3
count

** A blank reference is not a real link. If kept, it would match every research article whose
** detailstomerge3 is also blank in the merge that uses this file, giving them a spurious count.
drop if detailstomerge3 == ""
count

** Tag every row as correspondenceletter = 0 so the later merge only attaches to research articles.
g correspondenceletter = 0
drop mergewithtxt
sort correspondenceletter detailstomerge3

** 'replace' lets the do-file be re-run; without it save aborts once the file exists.
save "pubmed_result_final_2014-03-24_v4_formerge.dta", replace
** this file has the number of letters/comments associated per research article (correspondenceletter = 0) reference detailstomerge3

** Attach the per-reference letter counts to the article-level data and compare GHSS vs non-GHSS.
use "pubmed_result_final_2014-03-24_v4.dta", clear
sort correspondenceletter detailstomerge3 /* we merge only to research articles (correspondenceletter = 0) */
merge correspondenceletter detailstomerge3 using "pubmed_result_final_2014-03-24_v4_formerge" 
tab _merge
tab _merge correspondenceletter
** Scratch arithmetic on counts read off the tables above.
di 3392+1042

** Drop lookup rows that matched no article in the master data.
drop if _merge == 2
tab count
count

** Articles with no matching lookup row have no linked letters: set count and flag to 0.
replace count = 0 if count == .
tab cl2
replace cl2 = 0 if cl2 == .

** Among research articles: any letter, number of letters, and number of letters given at least one.
ttest cl2 if correspondenceletter == 0, by(key)
ttest count if correspondenceletter == 0, by(key)
ttest count if correspondenceletter == 0 & count > 0, by(key)

** 'replace' lets the do-file be re-run; without it save aborts once the file exists.
save "pubmed_result_final_2014-03-24_v5.dta", replace
